import sys
import os
import re
import codecs

# Goal: match the minimal (shortest) content between <CONTEXT> and </CONTEXT>.
# The code is as follows:


def get_kws(context):
    """Collect the token runs enclosed by <CONTEXT> ... </CONTEXT> markers.

    Args:
        context: Sequence of string tokens, scanned in order.

    Returns:
        A list with one space-joined string per closed span, in order of
        appearance. Tokens outside a span are skipped, and a span that is
        opened but never closed contributes nothing.
    """
    opening, closing = "<CONTEXT>", "</CONTEXT>"
    spans = []
    pending = []
    inside = False
    for token in context:
        if token.startswith(opening):
            # An opening marker always starts a fresh span, so when markers
            # repeat only the tokens after the latest opener are kept
            # (the minimal match).
            pending = []
            inside = True
        elif inside:
            if token.startswith(closing):
                spans.append(" ".join(pending))
                pending = []
                inside = False
            else:
                pending.append(token)
    return spans


def remove(filename, encoding=None):
    """Extract the <CONTEXT> ... </CONTEXT> keywords from each line of a file.

    Each line is stripped and split on whitespace. When the tokens contain a
    standalone ``<CONTEXT>`` or ``</CONTEXT>`` marker, the first token is kept
    as the line key and the remaining tokens are handed to ``get_kws``. The
    result is printed and written to ``filename + ".filtered.txt"`` as
    ``key<TAB>keywords``, with multiple keyword groups joined by ``" | "``.
    Lines without either marker are only echoed to stdout; nothing is written
    to the output file for them.

    Args:
        filename: Path of the input text file.
        encoding: Text encoding used for both the input and the output file.
            ``None`` (the default) uses the platform default encoding.
    """
    markers = ("<CONTEXT>", "</CONTEXT>")
    # Compile once; the same pattern is applied to every line.
    whitespace = re.compile(r"\s+")
    with open(filename, "r", encoding=encoding) as src, \
            open(filename + ".filtered.txt", "w", encoding=encoding) as dst:
        for line in src:
            stripped = line.strip()
            units = whitespace.split(stripped)
            print(stripped)
            if not any(marker in units for marker in markers):
                # Untagged line: echo the raw line (trailing newline included)
                # and skip it in the filtered output.
                print(line)
                continue
            # units[0] is the line key; only the tokens after it are scanned.
            # Joining with " | " also covers the single-group case unchanged.
            record = "{}\t{}".format(units[0], " | ".join(get_kws(units[1:])))
            print(record)
            dst.write(record + "\n")


if __name__ == '__main__':
    # The script takes the path of the file to filter as its first argument.
    # Example input path: examples/multi_cn/s0/south_3k/result.txt.3.27
    if len(sys.argv) < 2:
        # Exit with a usage message instead of an IndexError traceback.
        sys.exit("usage: {} <result_file>".format(sys.argv[0]))
    remove(sys.argv[1])
